{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import sys\n",
    "import warnings\n",
    "\n",
    "from functools import partial\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.stats import skew, kurtosis, iqr\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from sklearn.externals import joblib\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "sys.path.append('../')\n",
    "from src.utils import parallel_apply\n",
    "from src.feature_extraction import add_features_in_group\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "DIR = 'PATH/TO/YOUR/DATA'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n",
    "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n",
    "installments = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/installments_payments.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "installments.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "installments_one = installments[installments['SK_ID_CURR'] == 199697]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns of the installments table to aggregate and the statistics applied to each.\n",
    "AGGREGATED_COLUMNS = ['AMT_INSTALMENT',\n",
    "                      'AMT_PAYMENT',\n",
    "                      'DAYS_ENTRY_PAYMENT',\n",
    "                      'DAYS_INSTALMENT',\n",
    "                      'NUM_INSTALMENT_NUMBER',\n",
    "                      'NUM_INSTALMENT_VERSION']\n",
    "AGGREGATIONS = ['mean', 'min', 'max', 'sum', 'var']\n",
    "\n",
    "# One recipe per groupby key: (groupby columns, [(column, aggregation), ...]).\n",
    "# The aggregation varies slowest, so the pairs come out grouped by statistic.\n",
    "INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [\n",
    "    (['SK_ID_CURR'],\n",
    "     [(column, aggregation) for aggregation in AGGREGATIONS for column in AGGREGATED_COLUMNS]),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach every per-group aggregate of `installments` to `application` as a new column\n",
    "# named '<groupby cols>_<aggregation>_<source column>'.\n",
    "groupby_aggregate_names = []\n",
    "for groupby_cols, specs in tqdm(INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES):\n",
    "    group_object = installments.groupby(groupby_cols)\n",
    "    for select, agg in tqdm(specs):\n",
    "        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n",
    "        aggregate = (group_object[select]\n",
    "                     .agg(agg)\n",
    "                     .reset_index()\n",
    "                     .rename(columns={select: groupby_aggregate_name}))\n",
    "        # Drop the column left by a previous execution of this cell; otherwise the merge\n",
    "        # would emit `_x`/`_y` suffixed duplicates and the plain name would disappear.\n",
    "        if groupby_aggregate_name in application.columns:\n",
    "            application = application.drop(groupby_aggregate_name, axis=1)\n",
    "        application = application.merge(aggregate, on=groupby_cols, how='left')\n",
    "        groupby_aggregate_names.append(groupby_aggregate_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_agg = application[groupby_aggregate_names + ['TARGET']]\n",
    "application_agg_corr = abs(application_agg.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solution 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "positive_ID = application[application['TARGET']==1]['SK_ID_CURR'].tolist()\n",
    "positive_ID[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "value_counts = installments[installments['SK_ID_CURR'].isin(positive_ID)]['SK_ID_CURR'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "value_counts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.distplot(value_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "installments_one = installments[installments['SK_ID_CURR']==328162]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "installments_one.sort_values(['DAYS_INSTALMENT'],ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative subset: rows of the first 100 IDs in positive_ID (TARGET == 1).\n",
    "# installments_ = installments[installments['SK_ID_CURR'].isin(positive_ID[:100])]\n",
    "# Fixed seed so the sampled rows (and every feature derived from them) are the same on each run;\n",
    "# the explicit copy makes the column assignments below act on an independent frame.\n",
    "installments_ = installments.sample(10000, random_state=0).copy()\n",
    "installments_['instalment_paid_late_in_days'] = installments_['DAYS_ENTRY_PAYMENT'] - installments_['DAYS_INSTALMENT']\n",
    "installments_['instalment_paid_late'] = (installments_['instalment_paid_late_in_days'] > 0).astype(int)\n",
    "installments_['instalment_paid_over_amount'] = installments_['AMT_PAYMENT'] - installments_['AMT_INSTALMENT']\n",
    "installments_['instalment_paid_over'] = (installments_['instalment_paid_over_amount'] > 0).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_features(feature_name, aggs, features, feature_names, groupby):\n",
    "    \"\"\"Merge per-SK_ID_CURR aggregates of one column into ``features``.\n",
    "\n",
    "    Args:\n",
    "        feature_name: column of the grouped frame to aggregate.\n",
    "        aggs: aggregation names; 'kurt' and 'iqr' are computed with the scipy\n",
    "            functions, every other name is passed to pandas unchanged.\n",
    "        features: frame with an 'SK_ID_CURR' column that receives the new columns.\n",
    "        feature_names: list extended in place with the '<feature_name>_<agg>' names.\n",
    "        groupby: groupby object whose reset index exposes 'SK_ID_CURR'.\n",
    "\n",
    "    Returns:\n",
    "        Tuple of (features with one extra column per aggregation, feature_names).\n",
    "    \"\"\"\n",
    "    scipy_aggregations = {'kurt': kurtosis, 'iqr': iqr}\n",
    "\n",
    "    feature_names.extend(['{}_{}'.format(feature_name, agg) for agg in aggs])\n",
    "\n",
    "    for agg in aggs:\n",
    "        column_name = '{}_{}'.format(feature_name, agg)\n",
    "        aggregated = groupby[feature_name].agg(scipy_aggregations.get(agg, agg))\n",
    "        aggregated = aggregated.reset_index().rename(index=str, columns={feature_name: column_name})\n",
    "        features = features.merge(aggregated, on='SK_ID_CURR', how='left')\n",
    "    return features, feature_names\n",
    "\n",
    "\n",
    "def add_features_in_group(features, gr_, feature_name, aggs, prefix):\n",
    "    \"\"\"Compute the requested statistics of one column and store them in ``features``.\n",
    "\n",
    "    Args:\n",
    "        features: dict updated in place; keys are '<prefix><feature_name>_<agg>'.\n",
    "        gr_: frame (one group) holding the ``feature_name`` column.\n",
    "        feature_name: column to summarise.\n",
    "        aggs: aggregation names; names outside the supported set are skipped.\n",
    "        prefix: string prepended to every generated key.\n",
    "\n",
    "    Returns:\n",
    "        The same ``features`` dict.\n",
    "    \"\"\"\n",
    "    # 'skew', 'kurt' and 'iqr' use the scipy functions; the rest use the column's own methods.\n",
    "    reducers = {\n",
    "        'sum': lambda values: values.sum(),\n",
    "        'mean': lambda values: values.mean(),\n",
    "        'max': lambda values: values.max(),\n",
    "        'min': lambda values: values.min(),\n",
    "        'std': lambda values: values.std(),\n",
    "        'count': lambda values: values.count(),\n",
    "        'skew': skew,\n",
    "        'kurt': kurtosis,\n",
    "        'iqr': iqr,\n",
    "        'median': lambda values: values.median(),\n",
    "    }\n",
    "    for agg in aggs:\n",
    "        reducer = reducers.get(agg)\n",
    "        if reducer is None:\n",
    "            continue\n",
    "        features['{}{}_{}'.format(prefix, feature_name, agg)] = reducer(gr_[feature_name])\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = pd.DataFrame({'SK_ID_CURR':installments_['SK_ID_CURR'].unique()})\n",
    "groupby = installments_.groupby(['SK_ID_CURR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "installments_.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Per id aggregations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = []\n",
    "\n",
    "# Statistics requested for each column; the two `instalment_paid_*` flag columns only get sum and mean.\n",
    "DISTRIBUTION_AGGS = ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr']\n",
    "FLAG_AGGS = ['sum','mean']\n",
    "\n",
    "for feature_name, aggs in [('NUM_INSTALMENT_VERSION', DISTRIBUTION_AGGS),\n",
    "                           ('instalment_paid_late_in_days', DISTRIBUTION_AGGS),\n",
    "                           ('instalment_paid_late', FLAG_AGGS),\n",
    "                           ('instalment_paid_over_amount', DISTRIBUTION_AGGS),\n",
    "                           ('instalment_paid_over', FLAG_AGGS)]:\n",
    "    features, feature_names = add_features(feature_name, aggs, features, feature_names, groupby)\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Per id k last installment information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_k_instalment_features(gr, periods):\n",
    "    \"\"\"Summarise one group's installments inside each trailing window.\n",
    "\n",
    "    For every ``period`` the rows with ``DAYS_INSTALMENT >= -period`` are kept and\n",
    "    their statistics are stored under keys prefixed with 'last_<period>_'.\n",
    "\n",
    "    Args:\n",
    "        gr: frame with the installments of a single group.\n",
    "        periods: iterable of window lengths.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping feature name to value.\n",
    "    \"\"\"\n",
    "    distribution_aggs = ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr']\n",
    "    flag_aggs = ['count','mean']\n",
    "    column_aggs = [('NUM_INSTALMENT_VERSION', distribution_aggs),\n",
    "                   ('instalment_paid_late_in_days', distribution_aggs),\n",
    "                   ('instalment_paid_late', flag_aggs),\n",
    "                   ('instalment_paid_over_amount', distribution_aggs),\n",
    "                   ('instalment_paid_over', flag_aggs)]\n",
    "\n",
    "    gr_ = gr.copy()\n",
    "    features = {}\n",
    "    for period in periods:\n",
    "        gr_period = gr_[gr_['DAYS_INSTALMENT'] >= (-1) * period]\n",
    "        prefix = 'last_{}_'.format(period)\n",
    "        for feature_name, aggs in column_aggs:\n",
    "            features = add_features_in_group(features, gr_period, feature_name, aggs, prefix)\n",
    "\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func = partial(last_k_instalment_features, periods=[60, 180, 360, 720, 1500, 10e24])\n",
    "\n",
    "g = parallel_apply(groupby, func, index_name='SK_ID_CURR',\n",
    "                   num_workers=16, chunk_size=10000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_loan_features(gr):\n",
    "    \"\"\"Summarise the installments of the loan(s) holding the latest DAYS_INSTALMENT.\n",
    "\n",
    "    Rows are restricted to every ``SK_ID_PREV`` that appears on a row whose\n",
    "    ``DAYS_INSTALMENT`` equals the group's maximum; statistics of those rows are\n",
    "    stored under keys prefixed with 'last_loan_'.\n",
    "\n",
    "    Args:\n",
    "        gr: frame with the installments of a single group.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping feature name to value.\n",
    "    \"\"\"\n",
    "    amount_aggs = ['sum', 'mean', 'max', 'min', 'std']\n",
    "    flag_aggs = ['count', 'mean']\n",
    "\n",
    "    gr_ = gr.copy()\n",
    "    is_latest = gr_['DAYS_INSTALMENT'] == gr_['DAYS_INSTALMENT'].max()\n",
    "    last_installments_ids = gr_[is_latest]['SK_ID_PREV']\n",
    "    gr_ = gr_[gr_['SK_ID_PREV'].isin(last_installments_ids)]\n",
    "\n",
    "    features = {}\n",
    "    for feature_name, aggs in [('instalment_paid_late_in_days', amount_aggs),\n",
    "                               ('instalment_paid_late', flag_aggs),\n",
    "                               ('instalment_paid_over_amount', amount_aggs),\n",
    "                               ('instalment_paid_over', flag_aggs)]:\n",
    "        features = add_features_in_group(features, gr_, feature_name, aggs, 'last_loan_')\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = parallel_apply(groupby, last_loan_features, index_name='SK_ID_CURR',\n",
    "                   num_workers=16, chunk_size=10000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Per id dynamics (payment trends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trend_in_last_k_instalment_features(gr, periods):\n",
    "    \"\"\"Fit a linear trend to the payment columns inside each trailing window.\n",
    "\n",
    "    Rows are ordered by ``DAYS_INSTALMENT`` descending before fitting, and for every\n",
    "    ``period`` only rows with ``DAYS_INSTALMENT >= -period`` are used. Results are\n",
    "    stored under keys prefixed with '<period>_period_trend_'.\n",
    "\n",
    "    Args:\n",
    "        gr: frame with the installments of a single group.\n",
    "        periods: iterable of window lengths.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping feature name to trend value.\n",
    "    \"\"\"\n",
    "    trend_columns = ['instalment_paid_late_in_days', 'instalment_paid_over_amount']\n",
    "\n",
    "    gr_ = gr.sort_values(['DAYS_INSTALMENT'], ascending=False)\n",
    "\n",
    "    features = {}\n",
    "    for period in periods:\n",
    "        gr_period = gr_[gr_['DAYS_INSTALMENT'] >= (-1) * period]\n",
    "        prefix = '{}_period_trend_'.format(period)\n",
    "        for feature_name in trend_columns:\n",
    "            features = _add_trend_feature(features, gr_period, feature_name, prefix)\n",
    "    return features\n",
    "\n",
    "def _add_trend_feature(features, gr, feature_name, prefix):\n",
    "    \"\"\"Store the slope of a linear fit of ``gr[feature_name]`` against row position.\n",
    "\n",
    "    The regressor is 0, 1, 2, ... in the row order of ``gr``, so the sign of the\n",
    "    slope depends on how the caller ordered the rows.\n",
    "\n",
    "    Args:\n",
    "        features: dict updated in place with the key ``prefix + feature_name``.\n",
    "        gr: frame holding the ``feature_name`` column.\n",
    "        feature_name: column to fit.\n",
    "        prefix: string prepended to ``feature_name`` to build the key.\n",
    "\n",
    "    Returns:\n",
    "        The same ``features`` dict; the stored value is NaN when the fit raises.\n",
    "    \"\"\"\n",
    "    y = gr[feature_name].values\n",
    "    try:\n",
    "        x = np.arange(0, len(y)).reshape(-1, 1)\n",
    "        lr = LinearRegression()\n",
    "        lr.fit(x, y)\n",
    "        trend = lr.coef_[0]\n",
    "    except Exception:\n",
    "        # Best effort: a failed fit yields NaN. Unlike a bare `except`, this lets\n",
    "        # KeyboardInterrupt and SystemExit propagate so a long run can be stopped.\n",
    "        trend = np.nan\n",
    "    features['{}{}'.format(prefix, feature_name)] = trend\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func = partial(trend_in_last_k_instalment_features, periods=[60, 180, 360, 720, 1500])\n",
    "\n",
    "g = parallel_apply(groupby, func, index_name='SK_ID_CURR',\n",
    "                   num_workers=16, chunk_size=10000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features, on='SK_ID_CURR',how='left')\n",
    "X = X[features.columns.drop('SK_ID_CURR').tolist()+['TARGET']]\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_corr = abs(X.corr())\n",
    "X_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solution 5\n",
    "## Period fractions (last k installments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_k_instalment_features_with_fractions(gr, periods, fraction_periods):\n",
    "    \"\"\"Summarise the most recent rows of a group and add short/long window ratios.\n",
    "\n",
    "    Rows are ordered by ``DAYS_INSTALMENT`` descending and, for every ``period``,\n",
    "    the first ``period`` rows are summarised under keys prefixed 'last_<period>_'.\n",
    "    For each (short, long) pair, every short-window feature is divided by the\n",
    "    long-window feature at the same position of the sorted name lists and stored\n",
    "    with '_<short>_' replaced by '_<short>by<long>_fraction_'.\n",
    "\n",
    "    Args:\n",
    "        gr: frame with the installments of a single group.\n",
    "        periods: iterable of row counts.\n",
    "        fraction_periods: iterable of (short_period, long_period) pairs.\n",
    "\n",
    "    Returns:\n",
    "        pd.Series holding the window features and the fraction features.\n",
    "    \"\"\"\n",
    "    distribution_aggs = ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr']\n",
    "    flag_aggs = ['count','mean']\n",
    "    column_aggs = [('NUM_INSTALMENT_VERSION', distribution_aggs),\n",
    "                   ('instalment_paid_late_in_days', distribution_aggs),\n",
    "                   ('instalment_paid_late', flag_aggs),\n",
    "                   ('instalment_paid_over_amount', distribution_aggs),\n",
    "                   ('instalment_paid_over', flag_aggs)]\n",
    "\n",
    "    gr_ = gr.sort_values(['DAYS_INSTALMENT'], ascending=False)\n",
    "\n",
    "    features = {}\n",
    "    for period in periods:\n",
    "        gr_period = gr_.iloc[:period]\n",
    "        prefix = 'last_{}_'.format(period)\n",
    "        for feature_name, aggs in column_aggs:\n",
    "            features = add_features_in_group(features, gr_period, feature_name, aggs, prefix)\n",
    "\n",
    "    for short_period, long_period in fraction_periods:\n",
    "        short_feature_names = _get_feature_names(features, short_period)\n",
    "        long_feature_names = _get_feature_names(features, long_period)\n",
    "        old_name_chunk = '_{}_'.format(short_period)\n",
    "        new_name_chunk = '_{}by{}_fraction_'.format(short_period, long_period)\n",
    "\n",
    "        for short_feature, long_feature in zip(short_feature_names, long_feature_names):\n",
    "            fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk)\n",
    "            features[fraction_feature_name] = safe_div(features[short_feature], features[long_feature])\n",
    "    return pd.Series(features)\n",
    "\n",
    "def _get_feature_names(features, period):\n",
    "    \"\"\"Return, sorted, the keys of ``features`` that contain '_<period>_'.\"\"\"\n",
    "    marker = '_{}_'.format(period)\n",
    "    matching = [name for name in features.keys() if marker in name]\n",
    "    matching.sort()\n",
    "    return matching\n",
    "\n",
    "\n",
    "def safe_div(a, b):\n",
    "    \"\"\"Divide ``a`` by ``b`` as floats, falling back to 0.0 when that is impossible.\n",
    "\n",
    "    Returns 0.0 when an operand cannot be converted to float or ``b`` is zero.\n",
    "    NaN operands are not errors and propagate as NaN.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return float(a) / float(b)\n",
    "    except (TypeError, ValueError, ArithmeticError):\n",
    "        # ArithmeticError covers ZeroDivisionError and OverflowError. A bare `except`\n",
    "        # would also swallow KeyboardInterrupt and SystemExit.\n",
    "        return 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func = partial(last_k_instalment_features_with_fractions, \n",
    "               periods=[1,5,10,20,50,100],\n",
    "               fraction_periods=[(5,20),(5,50),(10,100)])\n",
    "\n",
    "g = parallel_apply(groupby, func, index_name='SK_ID_CURR',\n",
    "                   num_workers=16, chunk_size=1000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features, on='SK_ID_CURR',how='left')\n",
    "X = X[features.columns.drop('SK_ID_CURR').tolist()+['TARGET']]\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X_corr = abs(X.corr())\n",
    "X_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "features.columns[100:200]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solution 5\n",
    "## Period fractions (last k days)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_k_instalment_features_with_fractions(gr, periods, fraction_periods):\n",
    "    \"\"\"Return short/long window ratios of a group's installment statistics.\n",
    "\n",
    "    For every ``period`` the rows with ``DAYS_INSTALMENT >= -period`` are summarised\n",
    "    into temporary 'last_<period>_' features. For each (short, long) pair, every\n",
    "    short-window feature is divided by the long-window feature at the same position\n",
    "    of the sorted name lists and stored with '_<short>_' replaced by\n",
    "    '_<short>by<long>_fraction_'. Only the fraction features are returned.\n",
    "\n",
    "    Args:\n",
    "        gr: frame with the installments of a single group.\n",
    "        periods: iterable of window lengths.\n",
    "        fraction_periods: iterable of (short_period, long_period) pairs.\n",
    "\n",
    "    Returns:\n",
    "        pd.Series holding the fraction features.\n",
    "    \"\"\"\n",
    "    distribution_aggs = ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr']\n",
    "    flag_aggs = ['count','mean']\n",
    "    column_aggs = [('NUM_INSTALMENT_VERSION', distribution_aggs),\n",
    "                   ('instalment_paid_late_in_days', distribution_aggs),\n",
    "                   ('instalment_paid_late', flag_aggs),\n",
    "                   ('instalment_paid_over_amount', distribution_aggs),\n",
    "                   ('instalment_paid_over', flag_aggs)]\n",
    "\n",
    "    gr_ = gr.copy()\n",
    "\n",
    "    features = {}\n",
    "    features_temp = {}\n",
    "    for period in periods:\n",
    "        gr_period = gr_[gr_['DAYS_INSTALMENT'] >= (-1) * period]\n",
    "        prefix = 'last_{}_'.format(period)\n",
    "        for feature_name, aggs in column_aggs:\n",
    "            features_temp = add_features_in_group(features_temp, gr_period, feature_name, aggs, prefix)\n",
    "\n",
    "    for short_period, long_period in fraction_periods:\n",
    "        short_feature_names = _get_feature_names(features_temp, short_period)\n",
    "        long_feature_names = _get_feature_names(features_temp, long_period)\n",
    "        old_name_chunk = '_{}_'.format(short_period)\n",
    "        new_name_chunk = '_{}by{}_fraction_'.format(short_period, long_period)\n",
    "\n",
    "        for short_feature, long_feature in zip(short_feature_names, long_feature_names):\n",
    "            fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk)\n",
    "            features[fraction_feature_name] = safe_div(features_temp[short_feature], features_temp[long_feature])\n",
    "    return pd.Series(features)\n",
    "\n",
    "def _get_feature_names(features, period):\n",
    "    \"\"\"Return, sorted, the keys of ``features`` that contain '_<period>_'.\"\"\"\n",
    "    marker = '_{}_'.format(period)\n",
    "    matching = [name for name in features.keys() if marker in name]\n",
    "    matching.sort()\n",
    "    return matching\n",
    "\n",
    "\n",
    "def safe_div(a, b):\n",
    "    \"\"\"Divide ``a`` by ``b`` as floats, falling back to 0.0 when that is impossible.\n",
    "\n",
    "    Returns 0.0 when an operand cannot be converted to float or ``b`` is zero.\n",
    "    NaN operands are not errors and propagate as NaN.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return float(a) / float(b)\n",
    "    except (TypeError, ValueError, ArithmeticError):\n",
    "        # ArithmeticError covers ZeroDivisionError and OverflowError. A bare `except`\n",
    "        # would also swallow KeyboardInterrupt and SystemExit.\n",
    "        return 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func = partial(last_k_instalment_features_with_fractions, \n",
    "               periods=[60, 180, 360, 720, 1500],\n",
    "               fraction_periods=[(60,180),(60,360),(180,1500),(360,1500)])\n",
    "\n",
    "g = parallel_apply(groupby, func, index_name='SK_ID_CURR',\n",
    "                   num_workers=16, chunk_size=1000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features, on='SK_ID_CURR',how='left')\n",
    "X = X[features.columns.drop('SK_ID_CURR').tolist()+['TARGET']]\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X_corr = abs(X.corr())\n",
    "X_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "regex = 'last_loan.*instalment_paid_over.*'\n",
    "X_corr_truncated = X_corr.sort_values('TARGET', ascending=False).filter(regex=regex, axis=0)\n",
    "X_corr_truncated['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "colnames = X_corr_truncated.index.tolist() + ['TARGET']\n",
    "X_corr_truncated[colnames]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(X_corr_truncated[colnames], \n",
    "            xticklabels=colnames,\n",
    "            yticklabels=colnames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "installments"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
